setwd("/Users/aimhighfly/Documents/StonyBrookUniversity/Spring_semester/EST508/Projects/twitter")
library("twitteR")
library("tm")
## Loading required package: NLP
library("wordcloud")
## Loading required package: RColorBrewer
library("cluster")
library("FactoMineR")
library("RColorBrewer")
library("ggplot2")
## 
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
## 
##     annotate
library("magrittr")
library("dplyr")
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:twitteR':
## 
##     id, location
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library("googleVis")
## 
## Welcome to googleVis version 0.5.10
## 
## Please read the Google API Terms of Use
## before you start using the package:
## https://developers.google.com/terms/
## 
## Note, the plot method of googleVis will by default use
## the standard browser to display its output.
## 
## See the googleVis package vignettes for more details,
## or visit http://github.com/mages/googleVis.
## 
## To suppress this message use:
## suppressPackageStartupMessages(library(googleVis))

Set up twitter API

# Authenticate against the Twitter API.
# NOTE(security): consumer key/secret and access token/secret were previously
# hard-coded here and are now exposed in version control — revoke and rotate
# them. Supply credentials via environment variables (e.g. in ~/.Renviron)
# instead of committing them to source.
setup_twitter_oauth(Sys.getenv("TWITTER_CONSUMER_KEY"),
                    Sys.getenv("TWITTER_CONSUMER_SECRET"),
                    Sys.getenv("TWITTER_ACCESS_TOKEN"),
                    Sys.getenv("TWITTER_ACCESS_SECRET"))
## [1] "Using direct authentication"


Donald Trump


Time Series

trump <- read.csv("realDonaldTrump_tweets.csv", stringsAsFactors = F)
str(trump)
## 'data.frame':    3173 obs. of  5 variables:
##  $ id        : num  7.17e+17 7.17e+17 7.17e+17 7.17e+17 7.17e+17 ...
##  $ created_at: chr  "2016-04-05 03:29:30" "2016-04-05 03:16:32" "2016-04-05 02:00:43" "2016-04-05 01:14:09" ...
##  $ text      : chr  "b'MAKE AMERICA GREAT AGAIN!\\nhttps://t.co/iiXHgM7aA2'" "b'\"@FoxNews: @ScottBaio: \"#DonaldTrump is the only guy, I think, that has the will &amp; the nerve to attack &amp; to fight.\"| __truncated__ "b'\"@vikkideiter: Something VERY close to my heart. I\\'m a NAVY VET! I love @realDonaldTrump\\'s  VETERANS ADMINISTRATION REFO"| __truncated__ "b'I will be on @SeanHannity @FoxNews- tonight at 10pmE w/ @MELANIATRUMP, from Wisconsin. Enjoy! #WIPrimary #Trump2016 https://t"| __truncated__ ...
##  $ retweet   : int  977 1171 1460 1933 6271 2988 3719 3343 1662 2858 ...
##  $ favorite  : int  2266 2956 4474 5959 12606 9141 10085 9852 6128 9270 ...
# Split "created_at" ("YYYY-MM-DD HH:MM:SS") into separate date and time
# columns, tag each row with the candidate name, and derive two summaries:
#   trump_most - the most-retweeted tweet of each day (for the time series)
#   trump_t    - total retweet count per hour of day
trump$time <- trump$created_at            # copy full timestamp into a new column
names(trump)[2] <- "date"                 # rename created_at -> date
trump$date <- substr(trump$date, 1, 10)   # keep the "YYYY-MM-DD" part
trump$time <- substr(trump$time, 12, 20)  # keep the "HH:MM:SS" part
# cbind() keeps the original behaviour of the name column (factor under the
# stringsAsFactors default of this R version)
trump <- cbind(trump, "name" = "trump")
# Most-retweeted tweet per day
trump_most <- trump %>% group_by(date) %>% filter(retweet == max(retweet))

# Reduce time to the hour ("HH") and total retweets by hour of day
trump$time <- substr(trump$time, 1, 2)
trump_t <- trump %>% group_by(time) %>% summarise(sum(retweet))


# Interactive googleVis annotation chart of the daily most-retweeted tweet.
# datevar/numvar/idvar map the date, retweet count and series-name columns
# of trump_most onto the chart.
trump.anno <- gvisAnnotationChart(trump_most, 
                            datevar="date",
                            numvar="retweet", 
                            idvar="name",
                            options=list(
                              width=600, height=350,
                              fill=10, displayExactValues=TRUE,
                              colors="['blue']")
)
# Printing the gvis object renders the chart (browser / knitted page).
trump.anno
AnnotationChartID94265fb336e3

Data: trump_most • Chart ID: AnnotationChartID94265fb336e3 • googleVis-0.5.10
R version 3.2.4 (2016-03-10) • Google Terms of Use • Documentation and Data Policy

Wordcloud

# create corpus from vector
trump_corpus <- Corpus(VectorSource(trump$text))
trump_corpus
## <<VCorpus>>
## Metadata:  corpus specific: 0, document level (indexed): 0
## Content:  documents: 3173
inspect(trump_corpus[1])
## <<VCorpus>>
## Metadata:  corpus specific: 0, document level (indexed): 0
## Content:  documents: 1
## 
## [[1]]
## <<PlainTextDocument>>
## Metadata:  7
## Content:  chars: 53
# Clean the corpus: remove punctuation, lower-case, drop English stopwords,
# remove numbers, and collapse repeated whitespace.
trump_clean <- tm_map(trump_corpus, removePunctuation)
trump_clean <- tm_map(trump_clean, content_transformer(tolower))
trump_clean <- tm_map(trump_clean, removeWords, stopwords("english"))
trump_clean <- tm_map(trump_clean, removeNumbers)
trump_clean <- tm_map(trump_clean, stripWhitespace)
# Drop the "brt" token (artifact of the b'RT' Python-bytes prefix in the CSV
# export), matching the cleaning applied to the other candidates' corpora.
trump_clean <- tm_map(trump_clean, removeWords, c("brt"))
inspect(trump_clean[1])
## <<VCorpus>>
## Metadata:  corpus specific: 0, document level (indexed): 0
## Content:  documents: 1
## 
## [[1]]
## <<PlainTextDocument>>
## Metadata:  7
## Content:  chars: 42
wordcloud(trump_clean, random.order = FALSE, max.words = 50, scale = c(4, 0.5), col = rainbow(50))


Sentiment analysis

# Score each sentence as (# positive word hits) - (# negative word hits).
#
# Args:
#   sentences: character vector of texts to score.
#   pos.words: character vector of positive terms.
#   neg.words: character vector of negative terms.
#   .progress: kept only for backward compatibility with the old plyr-based
#              implementation; it is ignored (no progress bar is shown).
#
# Returns: a data.frame with columns `score` and `text` (same order/length
# as `sentences`).
#
# Implemented with base R (vapply / strsplit / %in%) instead of plyr/stringr.
# This avoids loading plyr after dplyr, which masks dplyr's verbs
# (summarise, mutate, ...) — the masking warnings were visible in the
# original knitted output of this document.
score.sentiment <- function(sentences, pos.words, neg.words, .progress = 'none') {
  scores <- vapply(sentences, function(sentence) {
    # Clean up: strip punctuation, control characters and digits,
    # then lower-case.
    sentence <- gsub('[[:punct:]]', '', sentence)
    sentence <- gsub('[[:cntrl:]]', '', sentence)
    sentence <- gsub('\\d+', '', sentence)
    sentence <- tolower(sentence)

    # Tokenize on runs of whitespace.
    words <- unlist(strsplit(sentence, '\\s+'))

    # %in% is TRUE once per occurrence (same counting as the original
    # !is.na(match(...))); TRUE sums as 1.
    sum(words %in% pos.words) - sum(words %in% neg.words)
  }, numeric(1), USE.NAMES = FALSE)

  data.frame(score = scores, text = sentences)
}


# Load the opinion lexicons (one word per line; lines starting with ';' are
# comments in the word-list files).
pos.words = scan("positive-words.txt", what='character', comment.char=';') 
neg.words = scan("negative-words.txt", what='character', comment.char=';') 

# Score every Trump tweet against the lexicons.
trump_sentiment = score.sentiment(trump$text, pos.words, neg.words)
## Loading required package: plyr
## -------------------------------------------------------------------------
## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)
## -------------------------------------------------------------------------
## 
## Attaching package: 'plyr'
## The following objects are masked from 'package:dplyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize
## The following object is masked from 'package:twitteR':
## 
##     id
## Loading required package: stringr
# Distribution of sentiment scores across all Trump tweets.
table(trump_sentiment$score)
## 
##  -6  -5  -4  -3  -2  -1   0   1   2   3   4   5   6 
##   3   7  26  67 159 374 825 898 496 212  87  14   5
hist(trump_sentiment$score)

# Base-graphics boxplot of retweet count by sentiment score.
boxplot(trump$retweet ~ trump_sentiment$score, xlab = "Sentiment score", ylab = "Retweeted count of Trump")

# Factor version of the score: gives qplot a discrete x axis (one box per
# score level) and a discrete colour legend.
trump_sentiment_score <- as.factor(trump_sentiment$score)

qplot(trump_sentiment_score, trump$retweet, geom=c("boxplot"), color = trump_sentiment_score,
      main="Retweeted count VS Sentiment score of Trump", 
      xlab = "Sentiment score", ylab = "Retweeted count")

# Scatter with a smoothed trend line over the numeric score.
qplot(trump_sentiment$score, trump$retweet, geom=c("point", "smooth"), 
      main="Retweeted count VS Sentiment score of Trump", 
      xlab = "Sentiment score", ylab = "Retweeted count")

qplot(trump_sentiment_score, trump$favorite, geom=c("boxplot"), color = trump_sentiment_score,
      main="Favorite count VS Sentiment score of Trump", 
      xlab = "Sentiment score", ylab = "Favorite count")

qplot(trump_sentiment$score, trump$favorite, geom=c("point", "smooth"), 
      main="Favorite count VS Sentiment score of Trump", 
      xlab = "Sentiment score", ylab = "Favorite count")

# Retweet/favorite ratio; rows where favorite == 0 produce non-finite values
# that ggplot drops (see the warnings below).
qplot(trump_sentiment_score, trump$retweet/trump$favorite, geom=c("boxplot"), color = trump_sentiment_score,
      main="Retweeted count/Favorite count VS Sentiment score of Trump", 
      xlab = "Sentiment score", ylab = "Retweeted count/Favorite count")
## Warning: Removed 86 rows containing non-finite values (stat_boxplot).

qplot(trump_sentiment$score, trump$retweet/trump$favorite, geom=c("point", "smooth"), 
      main="Retweeted count/Favorite count VS Sentiment score of Trump", 
      xlab = "Sentiment score", ylab = "Retweeted count/Favorite count")
## Warning: Removed 86 rows containing non-finite values (stat_smooth).



Hillary Clinton


Time Series

hillary <- read.csv("HillaryClinton_tweets.csv", stringsAsFactors = F)
str(hillary)
## 'data.frame':    3218 obs. of  5 variables:
##  $ id        : num  7.17e+17 7.17e+17 7.17e+17 7.17e+17 7.17e+17 ...
##  $ created_at: chr  "2016-04-05 00:39:33" "2016-04-04 23:47:45" "2016-04-04 23:12:51" "2016-04-04 22:26:50" ...
##  $ text      : chr  "b\"How can you stop @GovWalker and the GOP's attacks on education and women's rights?\\n\\nVote: https://t.co/XmpM1irN6v https:"| __truncated__ "b'RT @TheBriefing2016: Hillary has earned the most votes \\xe2\\x9c\\x93\\nWhen more people vote, Hillary wins \\xe2\\x9c\\x93\"| __truncated__ "b'Born on this day in 1928, Maya Angelou\\xe2\\x80\\x99s voice holds a powerful place in the ongoing fight for justice. https:/"| __truncated__ "b'48 years ago, we lost a giant in the fight for equality. Let\\xe2\\x80\\x99s honor Dr. King and keep bending the arc of the m"| __truncated__ ...
##  $ retweet   : int  481 479 1728 1417 1042 269 7420 888 893 1956 ...
##  $ favorite  : int  1048 0 3996 3488 2090 544 0 2115 2444 4061 ...
# Split "created_at" ("YYYY-MM-DD HH:MM:SS") into separate date and time
# columns, tag each row with the candidate name, and derive two summaries:
#   hillary_most - the most-retweeted tweet of each day (for the time series)
#   hillary_t    - total retweet count per hour of day
hillary$time <- hillary$created_at            # copy full timestamp into a new column
names(hillary)[2] <- "date"                   # rename created_at -> date
hillary$date <- substr(hillary$date, 1, 10)   # keep the "YYYY-MM-DD" part
hillary$time <- substr(hillary$time, 12, 20)  # keep the "HH:MM:SS" part
# cbind() keeps the original behaviour of the name column (factor under the
# stringsAsFactors default of this R version)
hillary <- cbind(hillary, "name" = "hillary")
# Most-retweeted tweet per day
hillary_most <- hillary %>% group_by(date) %>% filter(retweet == max(retweet))

# Reduce time to the hour ("HH") and total retweets by hour of day
hillary$time <- substr(hillary$time, 1, 2)
hillary_t <- hillary %>% group_by(time) %>% summarise(sum(retweet))
# Interactive googleVis annotation chart of Clinton's daily most-retweeted
# tweet; datevar/numvar/idvar map columns of hillary_most onto the chart.
hillary.anno <- gvisAnnotationChart(hillary_most, 
                            datevar="date",
                            numvar="retweet", 
                            idvar="name",
                            options=list(
                              width=600, height=350,
                              fill=10, displayExactValues=TRUE,
                              colors="['red']")
)
# Printing the gvis object renders the chart (browser / knitted page).
hillary.anno
AnnotationChartID942635ad853d

Data: hillary_most • Chart ID: AnnotationChartID942635ad853d • googleVis-0.5.10
R version 3.2.4 (2016-03-10) • Google Terms of Use • Documentation and Data Policy

Wordcloud

# create corpus
hillary_corpus <- Corpus(VectorSource(hillary$text))
hillary_corpus
## <<VCorpus>>
## Metadata:  corpus specific: 0, document level (indexed): 0
## Content:  documents: 3218
inspect(hillary_corpus[1])
## <<VCorpus>>
## Metadata:  corpus specific: 0, document level (indexed): 0
## Content:  documents: 1
## 
## [[1]]
## <<PlainTextDocument>>
## Metadata:  7
## Content:  chars: 142
# Clean the corpus: remove punctuation, lower-case, drop English stopwords,
# remove numbers, and collapse repeated whitespace.
hillary_clean <- tm_map(hillary_corpus, removePunctuation)
hillary_clean <- tm_map(hillary_clean, content_transformer(tolower))
hillary_clean <- tm_map(hillary_clean, removeWords, stopwords("english"))
hillary_clean <- tm_map(hillary_clean, removeNumbers)
hillary_clean <- tm_map(hillary_clean, stripWhitespace)
# "brt" is an artifact of the b'RT' Python-bytes prefix in the CSV export.
hillary_clean <- tm_map(hillary_clean, removeWords, c("brt"))
inspect(hillary_clean[1])
## <<VCorpus>>
## Metadata:  corpus specific: 0, document level (indexed): 0
## Content:  documents: 1
## 
## [[1]]
## <<PlainTextDocument>>
## Metadata:  7
## Content:  chars: 101
wordcloud(hillary_clean, random.order = F ,max.words = 50, scale = c(4, 0.5), col = rainbow(50))


Sentiment analysis

# Sentiment-score every Clinton tweet and relate the score to engagement.
hillary_sentiment <- score.sentiment(hillary$text, pos.words, neg.words)
table(hillary_sentiment$score)
## 
##   -4   -3   -2   -1    0    1    2    3    4    5    6 
##   13   29  131  364 1448  838  301   80   11    2    1
hist(hillary_sentiment$score)

boxplot(hillary$retweet ~ hillary_sentiment$score, xlab = "Sentiment score", ylab = "Retweeted count of Clinton")

# Factor version of the score for discrete axes/legends.
hillary_sentiment_score <- as.factor(hillary_sentiment$score)

# FIX: boxplot qplots now pass the factor score on the x axis (as in the
# Trump section). The numeric score was used before, which makes ggplot treat
# x as continuous and collapse all tweets into a single box.
qplot(hillary_sentiment_score, hillary$retweet, geom=c("boxplot"), color = hillary_sentiment_score,
      main="Retweeted count VS Sentiment score of Hillary", 
      xlab = "Sentiment score", ylab = "Retweeted count")

qplot(hillary_sentiment$score, hillary$retweet, geom=c("point", "smooth"), 
      main="Retweeted count VS Sentiment score of Hillary", 
      xlab = "Sentiment score", ylab = "Retweeted count")

qplot(hillary_sentiment_score, hillary$favorite, geom=c("boxplot"), color = hillary_sentiment_score,
      main="Favorite count VS Sentiment score of Hillary", 
      xlab = "Sentiment score", ylab = "Favorite count")

qplot(hillary_sentiment$score, hillary$favorite, geom=c("point", "smooth"), 
      main="Favorite count VS Sentiment score of Hillary", 
      xlab = "Sentiment score", ylab = "Favorite count")

# Retweet/favorite ratio; rows where favorite == 0 produce non-finite values
# that ggplot drops (see the warnings below).
qplot(hillary_sentiment_score, hillary$retweet/hillary$favorite, geom=c("boxplot"), color = hillary_sentiment_score,
      main="Retweeted count/Favorite count VS Sentiment score of Hillary", 
      xlab = "Sentiment score", ylab = "Retweeted count/Favorite count")
## Warning: Removed 828 rows containing non-finite values (stat_boxplot).

qplot(hillary_sentiment$score, hillary$retweet/hillary$favorite, geom=c("point", "smooth"), 
      main="Retweeted count/Favorite count VS Sentiment score of Hillary", 
      xlab = "Sentiment score", ylab = "Retweeted count/Favorite count")
## Warning: Removed 828 rows containing non-finite values (stat_smooth).



Ted Cruz


Time Series

ted <- read.csv("tedcruz_tweets.csv", stringsAsFactors = F)
str(ted)
## 'data.frame':    3216 obs. of  5 variables:
##  $ id        : num  7.17e+17 7.17e+17 7.17e+17 7.17e+17 7.17e+17 ...
##  $ created_at: chr  "2016-04-05 02:39:52" "2016-04-05 02:17:10" "2016-04-05 01:38:02" "2016-04-05 01:34:04" ...
##  $ text      : chr  "b\"Congrats to #CruzCrew's Karen &amp; her husband who won our #NationalChampionship contest! Thank you for the support! https:"| __truncated__ "b'Thank you Waukesha! #ChooseCruz tomorrow: https://t.co/elr0EH0EBs https://t.co/uq14DboB4m'" "b'RT @megynkelly: .@tedcruz on abortion: \\xe2\\x80\\x9cI\\xe2\\x80\\x99m pro-life. I believe that we should protect every huma"| __truncated__ "b'RT @FoxNews: .@TedCruz: \\xe2\\x80\\x9cObamaCare is the biggest job killer in this country. Millions of Americans are hurting"| __truncated__ ...
##  $ retweet   : int  117 313 466 335 153 173 136 138 487 145 ...
##  $ favorite  : int  411 596 0 0 0 422 0 0 782 0 ...
# Split "created_at" ("YYYY-MM-DD HH:MM:SS") into separate date and time
# columns, tag each row with the candidate name, and derive two summaries:
#   ted_most - the most-retweeted tweet of each day (for the time series)
#   ted_t    - total retweet count per hour of day
ted$time <- ted$created_at            # copy full timestamp into a new column
names(ted)[2] <- "date"               # rename created_at -> date
ted$date <- substr(ted$date, 1, 10)   # keep the "YYYY-MM-DD" part
ted$time <- substr(ted$time, 12, 20)  # keep the "HH:MM:SS" part
# cbind() keeps the original behaviour of the name column (factor under the
# stringsAsFactors default of this R version)
ted <- cbind(ted, "name" = "ted")
# Most-retweeted tweet per day
ted_most <- ted %>% group_by(date) %>% filter(retweet == max(retweet))

# Reduce time to the hour ("HH") and total retweets by hour of day
ted$time <- substr(ted$time, 1, 2)
ted_t <- ted %>% group_by(time) %>% summarise(sum(retweet))
# Interactive googleVis annotation chart of Cruz's daily most-retweeted
# tweet; datevar/numvar/idvar map columns of ted_most onto the chart.
ted.anno <- gvisAnnotationChart(ted_most, 
                            datevar="date",
                            numvar="retweet", 
                            idvar="name",
                            options=list(
                              width=600, height=350,
                              fill=10, displayExactValues=TRUE,
                              colors="['green']")
)
# Printing the gvis object renders the chart (browser / knitted page).
ted.anno
AnnotationChartID94261da24c24

Data: ted_most • Chart ID: AnnotationChartID94261da24c24 • googleVis-0.5.10
R version 3.2.4 (2016-03-10) • Google Terms of Use • Documentation and Data Policy

Wordcloud

# create corpus
ted_corpus <- Corpus(VectorSource(ted$text))
ted_corpus
## <<VCorpus>>
## Metadata:  corpus specific: 0, document level (indexed): 0
## Content:  documents: 3216
inspect(ted_corpus[1])
## <<VCorpus>>
## Metadata:  corpus specific: 0, document level (indexed): 0
## Content:  documents: 1
## 
## [[1]]
## <<PlainTextDocument>>
## Metadata:  7
## Content:  chars: 144
# Clean the corpus: remove punctuation, lower-case, drop English stopwords,
# remove numbers, and collapse repeated whitespace.
ted_clean <- tm_map(ted_corpus, removePunctuation)
ted_clean <- tm_map(ted_clean, content_transformer(tolower))
ted_clean <- tm_map(ted_clean, removeWords, stopwords("english"))
ted_clean <- tm_map(ted_clean, removeNumbers)
ted_clean <- tm_map(ted_clean, stripWhitespace)
# "brt" is an artifact of the b'RT' Python-bytes prefix in the CSV export.
ted_clean <- tm_map(ted_clean, removeWords, c("brt"))
inspect(ted_clean[1])
## <<VCorpus>>
## Metadata:  corpus specific: 0, document level (indexed): 0
## Content:  documents: 1
## 
## [[1]]
## <<PlainTextDocument>>
## Metadata:  7
## Content:  chars: 102
wordcloud(ted_clean, random.order = F ,max.words = 50, scale = c(4, 0.5), col = rainbow(50))


Sentiment analysis

# Sentiment-score every Cruz tweet and relate the score to engagement.
ted_sentiment <- score.sentiment(ted$text, pos.words, neg.words)
table(ted_sentiment$score)
## 
##   -5   -4   -3   -2   -1    0    1    2    3    4    5 
##    2    1   13   46  245 1714  894  233   53   13    2
hist(ted_sentiment$score)

boxplot(ted$retweet ~ ted_sentiment$score, xlab = "Sentiment score", ylab = "Retweeted count of Ted")

# Factor version of the score for discrete axes/legends.
ted_sentiment_score <- as.factor(ted_sentiment$score)

# FIX: boxplot qplots now pass the factor score on the x axis (as in the
# Trump section). The numeric score was used before, which makes ggplot treat
# x as continuous and collapse all tweets into a single box.
qplot(ted_sentiment_score, ted$retweet, geom=c("boxplot"), color = ted_sentiment_score,
      main="Retweeted count VS Sentiment score of Ted", 
      xlab = "Sentiment score", ylab = "Retweeted count")

qplot(ted_sentiment$score, ted$retweet, geom=c("point", "smooth"), 
      main="Retweeted count VS Sentiment score of Ted", 
      xlab = "Sentiment score", ylab = "Retweeted count")

qplot(ted_sentiment_score, ted$favorite, geom=c("boxplot"), color = ted_sentiment_score,
      main="Favorite count VS Sentiment score of Ted", 
      xlab = "Sentiment score", ylab = "Favorite count")

qplot(ted_sentiment$score, ted$favorite, geom=c("point", "smooth"), 
      main="Favorite count VS Sentiment score of Ted", 
      xlab = "Sentiment score", ylab = "Favorite count")

# Retweet/favorite ratio; rows where favorite == 0 produce non-finite values
# that ggplot drops (see the warnings below).
qplot(ted_sentiment_score, ted$retweet/ted$favorite, geom=c("boxplot"), color = ted_sentiment_score,
      main="Retweeted count/Favorite count VS Sentiment score of Ted", 
      xlab = "Sentiment score", ylab = "Retweeted count/Favorite count")
## Warning: Removed 2038 rows containing non-finite values (stat_boxplot).

qplot(ted_sentiment$score, ted$retweet/ted$favorite, geom=c("point", "smooth"), 
      main="Retweeted count/Favorite count VS Sentiment score of Ted", 
      xlab = "Sentiment score", ylab = "Retweeted count/Favorite count")
## Warning: Removed 2038 rows containing non-finite values (stat_smooth).



Bernie Sanders


Time Series

bernie <- read.csv("BernieSanders_tweets.csv", stringsAsFactors = F)
str(bernie)
## 'data.frame':    3184 obs. of  5 variables:
##  $ id        : num  7.17e+17 7.17e+17 7.17e+17 7.17e+17 7.17e+17 ...
##  $ created_at: chr  "2016-04-05 01:17:04" "2016-04-05 00:25:28" "2016-04-04 23:56:07" "2016-04-04 23:22:33" ...
##  $ text      : chr  "b'\"The major media don\\xe2\\x80\\x99t know how to report on movements.\" \\xe2\\x80\\x94 @RBReich\\nhttps://t.co/86IOLPIkmC'" "b\"Making the American taxpayer pick up BP's bill for cleaning the disastrous Deepwater Horizon spill is an outrage. https://t."| __truncated__ "b\"A victory for workers fighting for collective bargaining rights at Trump's Vegas hotel. Unions make America great.\\nhttps:/"| __truncated__ "b'RT @ariannaijones: \\xe2\\x80\\x9cWhy I endorse Bernie Sanders for President of the United States\\xe2\\x80\\x9d by Barbara L"| __truncated__ ...
##  $ retweet   : int  580 1286 1067 583 3998 2563 1472 1151 2352 479 ...
##  $ favorite  : int  1282 2151 2486 0 7032 5921 4064 2801 6200 0 ...
# Split "created_at" ("YYYY-MM-DD HH:MM:SS") into separate date and time
# columns, tag each row with the candidate name, and derive two summaries:
#   bernie_most - the most-retweeted tweet of each day (for the time series)
#   bernie_t    - total retweet count per hour of day
bernie$time <- bernie$created_at            # copy full timestamp into a new column
names(bernie)[2] <- "date"                  # rename created_at -> date
bernie$date <- substr(bernie$date, 1, 10)   # keep the "YYYY-MM-DD" part
bernie$time <- substr(bernie$time, 12, 20)  # keep the "HH:MM:SS" part
# cbind() keeps the original behaviour of the name column (factor under the
# stringsAsFactors default of this R version)
bernie <- cbind(bernie, "name" = "bernie")
# Most-retweeted tweet per day
bernie_most <- bernie %>% group_by(date) %>% filter(retweet == max(retweet))

# Reduce time to the hour ("HH") and total retweets by hour of day
bernie$time <- substr(bernie$time, 1, 2)
bernie_t <- bernie %>% group_by(time) %>% summarise(sum(retweet))
# Interactive googleVis annotation chart of Sanders' daily most-retweeted
# tweet; datevar/numvar/idvar map columns of bernie_most onto the chart.
bernie.anno <- gvisAnnotationChart(bernie_most, 
                            datevar="date",
                            numvar="retweet", 
                            idvar="name",
                            options=list(
                              width=600, height=350,
                              fill=10, displayExactValues=TRUE,
                              colors="['yellow']")
)
# Printing the gvis object renders the chart (browser / knitted page).
bernie.anno
AnnotationChartID942643862158

Data: bernie_most • Chart ID: AnnotationChartID942643862158 • googleVis-0.5.10
R version 3.2.4 (2016-03-10) • Google Terms of Use • Documentation and Data Policy

Wordcloud

# create corpus
bernie_corpus <- Corpus(VectorSource(bernie$text))
bernie_corpus
## <<VCorpus>>
## Metadata:  corpus specific: 0, document level (indexed): 0
## Content:  documents: 3184
inspect(ted_corpus[1])
## <<VCorpus>>
## Metadata:  corpus specific: 0, document level (indexed): 0
## Content:  documents: 1
## 
## [[1]]
## <<PlainTextDocument>>
## Metadata:  7
## Content:  chars: 144
# Clean the corpus: remove punctuation, lower-case, drop English stopwords,
# remove numbers, and collapse repeated whitespace.
bernie_clean <- tm_map(bernie_corpus, removePunctuation)
bernie_clean <- tm_map(bernie_clean, content_transformer(tolower))
bernie_clean <- tm_map(bernie_clean, removeWords, stopwords("english"))
bernie_clean <- tm_map(bernie_clean, removeNumbers)
bernie_clean <- tm_map(bernie_clean, stripWhitespace)
# "brt" is an artifact of the b'RT' Python-bytes prefix in the CSV export.
bernie_clean <- tm_map(bernie_clean, removeWords, c("brt"))
inspect(bernie_clean[1])
## <<VCorpus>>
## Metadata:  corpus specific: 0, document level (indexed): 0
## Content:  documents: 1
## 
## [[1]]
## <<PlainTextDocument>>
## Metadata:  7
## Content:  chars: 77
wordcloud(bernie_clean, random.order = F ,max.words = 50, scale = c(4, 0.5), col = rainbow(50))


Sentiment analysis

# Sentiment-score every Sanders tweet and relate the score to engagement.
bernie_sentiment <- score.sentiment(bernie$text, pos.words, neg.words)
table(bernie_sentiment$score)
## 
##   -6   -5   -4   -3   -2   -1    0    1    2    3    4    5 
##    1    1   11   37  127  417 1544  755  236   48    6    1
hist(bernie_sentiment$score)

boxplot(bernie$retweet ~ bernie_sentiment$score, xlab = "Sentiment score", ylab = "Retweeted count of Bernie")

# Factor version of the score for discrete axes/legends.
bernie_sentiment_score <- as.factor(bernie_sentiment$score)

# FIX: boxplot qplots now pass the factor score on the x axis (as in the
# Trump section). The numeric score was used before, which makes ggplot treat
# x as continuous and collapse all tweets into a single box.
qplot(bernie_sentiment_score, bernie$retweet, geom=c("boxplot"), color = bernie_sentiment_score,
      main="Retweeted count VS Sentiment score of Bernie", 
      xlab = "Sentiment score", ylab = "Retweeted count")

qplot(bernie_sentiment$score, bernie$retweet, geom=c("point", "smooth"), 
      main="Retweeted count VS Sentiment score of Bernie", 
      xlab = "Sentiment score", ylab = "Retweeted count")

qplot(bernie_sentiment_score, bernie$favorite, geom=c("boxplot"), color = bernie_sentiment_score,
      main="Favorite count VS Sentiment score of Bernie", 
      xlab = "Sentiment score", ylab = "Favorite count")

qplot(bernie_sentiment$score, bernie$favorite, geom=c("point", "smooth"), 
      main="Favorite count VS Sentiment score of Bernie", 
      xlab = "Sentiment score", ylab = "Favorite count")

# Retweet/favorite ratio; rows where favorite == 0 produce non-finite values
# that ggplot drops (see the warnings below).
qplot(bernie_sentiment_score, bernie$retweet/bernie$favorite, geom=c("boxplot"), color = bernie_sentiment_score,
      main="Retweeted count/Favorite count VS Sentiment score of Bernie", 
      xlab = "Sentiment score", ylab = "Retweeted count/Favorite count")
## Warning: Removed 1089 rows containing non-finite values (stat_boxplot).

qplot(bernie_sentiment$score, bernie$retweet/bernie$favorite, geom=c("point", "smooth"), 
      main="Retweeted count/Favorite count VS Sentiment score of Bernie", 
      xlab = "Sentiment score", ylab = "Retweeted count/Favorite count")
## Warning: Removed 1089 rows containing non-finite values (stat_smooth).



Comparison


Time Series

comparison <- rbind(trump, hillary, ted, bernie)
str(comparison)
## 'data.frame':    12791 obs. of  7 variables:
##  $ id      : num  7.17e+17 7.17e+17 7.17e+17 7.17e+17 7.17e+17 ...
##  $ date    : chr  "2016-04-05" "2016-04-05" "2016-04-05" "2016-04-05" ...
##  $ text    : chr  "b'MAKE AMERICA GREAT AGAIN!\\nhttps://t.co/iiXHgM7aA2'" "b'\"@FoxNews: @ScottBaio: \"#DonaldTrump is the only guy, I think, that has the will &amp; the nerve to attack &amp; to fight.\"| __truncated__ "b'\"@vikkideiter: Something VERY close to my heart. I\\'m a NAVY VET! I love @realDonaldTrump\\'s  VETERANS ADMINISTRATION REFO"| __truncated__ "b'I will be on @SeanHannity @FoxNews- tonight at 10pmE w/ @MELANIATRUMP, from Wisconsin. Enjoy! #WIPrimary #Trump2016 https://t"| __truncated__ ...
##  $ retweet : int  977 1171 1460 1933 6271 2988 3719 3343 1662 2858 ...
##  $ favorite: int  2266 2956 4474 5959 12606 9141 10085 9852 6128 9270 ...
##  $ time    : chr  "03" "03" "02" "01" ...
##  $ name    : Factor w/ 4 levels "trump","hillary",..: 1 1 1 1 1 1 1 1 1 1 ...
# Daily most-retweeted tweets of all four candidates in one annotation chart;
# idvar="name" draws one coloured series per candidate.
comparison_most <-  rbind(trump_most, hillary_most, ted_most, bernie_most)

comparison.anno <- gvisAnnotationChart(comparison_most, 
                            datevar="date",
                            numvar="retweet", 
                            idvar="name",
                            options=list(
                              width=700, height=400,
                              fill=10, displayExactValues=TRUE,
                              colors="['blue','red', 'green', 'yellow']")
)
# Printing the gvis object renders the chart (browser / knitted page).
comparison.anno
AnnotationChartID942611c4528e

Data: comparison_most • Chart ID: AnnotationChartID942611c4528e • googleVis-0.5.10
R version 3.2.4 (2016-03-10) • Google Terms of Use • Documentation and Data Policy

Wordcloud

# Draw the four wordclouds in a 2x2 grid for side-by-side comparison.
# par() returns the previous settings, which are restored afterwards so the
# mfrow change does not leak into later plots.
old_par <- par(mfrow = c(2, 2))
wordcloud(trump_clean, random.order = FALSE, max.words = 50, scale = c(4, 0.5), col = rainbow(50))
wordcloud(hillary_clean, random.order = FALSE, max.words = 50, scale = c(4, 0.5), col = rainbow(50))
wordcloud(ted_clean, random.order = FALSE, max.words = 50, scale = c(4, 0.5), col = rainbow(50))
wordcloud(bernie_clean, random.order = FALSE, max.words = 50, scale = c(4, 0.5), col = rainbow(50))
par(old_par)